Title level 1

Final copy

Title level 2

bold

italics

Load packages

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
source("functions.R")

I downloaded the file and loaded it into R

# Fetch the five-year gapminder CSV and store it under data/.
download.file(
  "https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/gh-pages/_episodes_rmd/data/gapminder-FiveYearData.csv",
  destfile = "data/gapminder-FiveYearData.csv"
)

# Load the downloaded file into a data frame.
gapminder <- read.csv("data/gapminder-FiveYearData.csv")

# Peek at the first rows to check the columns.
head(gapminder)
##       country year      pop continent lifeExp gdpPercap
## 1 Afghanistan 1952  8425333      Asia  28.801  779.4453
## 2 Afghanistan 1957  9240934      Asia  30.332  820.8530
## 3 Afghanistan 1962 10267083      Asia  31.997  853.1007
## 4 Afghanistan 1967 11537966      Asia  34.020  836.1971
## 5 Afghanistan 1972 13079460      Asia  36.088  739.9811
## 6 Afghanistan 1977 14880372      Asia  38.438  786.1134

I wonder if rstats increases life expectancy over the years

# Scatter plot of life expectancy against year, one point per gapminder row.
p <- ggplot(data = gapminder, aes(x = year, y = lifeExp)) +
  geom_point()

# Printing the object draws the plot.
p

Let’s see the interactive version

# Turn the static ggplot object `p` into an interactive plotly figure.
ggplotly(p)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

Making your own functions

If you are repeating yourself in your code, you may be able to solve that problem by making your own function!

# Standard error of the mean: sd(x) / sqrt(n).
#
# x      Numeric vector of observations.
# na.rm  If TRUE, missing values are dropped first, so n counts only the
#        observed values. The default FALSE keeps the original behaviour:
#        any NA in x makes the result NA.
#
# Returns a single numeric value. A single observation gives NA because
# sd() is NA for length-one input.
se <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  sd(x) / sqrt(length(x))
}

# Small example vector to try the function on.
cars <- c(3, 4, 5, 6, 7, 10)

# Standard error of the example values.
se(cars)
## [1] 1.013794

Data manipulation with dplyr

You will likely want to get subsections of your dataframe and/or calculate means of a variable for a certain subsection. dplyr is your friend!

Explored select

# Re-read the data so this section starts from the file on disk.
gapminder <- read.csv("data/gapminder-FiveYearData.csv")

# Two ways to end up with the year, country and gdpPercap columns:
# name the columns to keep ...
year_country_gdp <- select(gapminder, year, country, gdpPercap)
# ... or name the columns to drop. This second assignment overwrites the
# first, so the remaining columns keep their original order.
year_country_gdp <- select(gapminder, -pop, -continent, -lifeExp)
names(year_country_gdp)
## [1] "country"   "year"      "gdpPercap"

Explore filter. The pipe (%>%) passes the result on its left as the first argument of the next function (shortcut: Ctrl+Shift+M).

# Keep only the European rows, then only the three columns of interest.
year_country_gdp_euro <- gapminder %>%
  filter(continent == "Europe") %>%
  select(year, country, gdpPercap)

Exploring the amazing group_by and summarize functions

# One row per country: mean GDP per capita and its standard error.
mean_gdp_percountry <- gapminder %>%
  group_by(country) %>%
  summarise(
    mean_gdp = mean(gdpPercap),
    se_gdp = se(gdpPercap)
  )

mean_gdp_percountry
## # A tibble: 142 x 3
##        country   mean_gdp     se_gdp
##         <fctr>      <dbl>      <dbl>
##  1 Afghanistan   802.6746   31.23550
##  2     Albania  3255.3666  344.20223
##  3     Algeria  4426.0260  378.26190
##  4      Angola  3607.1005  336.56641
##  5   Argentina  8955.5538  537.68144
##  6   Australia 19980.5956 2256.11315
##  7     Austria 20411.9163 2787.23968
##  8     Bahrain 18077.6639 1563.29518
##  9  Bangladesh   817.5588   67.86165
## 10     Belgium 19900.7581 2422.32683
## # ... with 132 more rows

Challenge: I want the mean, se and sample size of life expectancy by continent. n() is an easier way to find the “sample size” than length(), but it only works inside summarise() or another dplyr function. You can group_by multiple variables (just separate them with a comma).

# Grouping by continent AND country gives one summary row per country,
# with the continent carried along as an extra identifier column.
mean_lifeExp_percontinent <- gapminder %>%
  group_by(continent, country) %>%
  summarise(
    mean_lifeExp = mean(lifeExp),
    se_lifeExp = se(lifeExp),
    length_lifeExp = n() # number of rows in each group
  )

mean_lifeExp_percontinent
## # A tibble: 142 x 5
## # Groups:   continent [?]
##    continent                  country mean_lifeExp se_lifeExp
##       <fctr>                   <fctr>        <dbl>      <dbl>
##  1    Africa                  Algeria     59.03017  2.9849208
##  2    Africa                   Angola     37.88350  1.1562236
##  3    Africa                    Benin     48.77992  1.7691977
##  4    Africa                 Botswana     54.59750  1.7116922
##  5    Africa             Burkina Faso     44.69400  1.9762099
##  6    Africa                  Burundi     44.81733  0.9165096
##  7    Africa                 Cameroon     48.12850  1.5784640
##  8    Africa Central African Republic     43.86692  1.3627459
##  9    Africa                     Chad     46.77358  1.4110376
## 10    Africa                  Comoros     52.38175  2.3476081
## # ... with 132 more rows, and 1 more variables: length_lifeExp <int>

Combining ggplot and dplyr. Easy tools to save: ggsave and write.csv (the “/” denotes which folder the file goes in - don’t put it in your “data” folder, because you risk overwriting your original data).

# Life expectancy over time for the European rows only, drawn as one line
# per country and split into one panel per country.
euro_countries <- gapminder %>%
  filter(continent == "Europe") %>%
  ggplot(aes(x = year, y = lifeExp, color = country)) +
  geom_line() +
  facet_wrap(~country)

euro_countries

# Pass the plot explicitly so the saved image does not depend on which plot
# happened to be displayed last.
ggsave("euro.png", plot = euro_countries)
## Saving 7 x 5 in image
# Save the per-country GDP summary; the extension is ".csv" (it was
# misspelled ".cvs"), so other tools recognise the file type.
write.csv(mean_gdp_percountry, "processed/mean_gdp_percountry.csv")

Data manipulation with tidyr

Make sure you use the right data frame (gapminder_wide vs. gap_long).

A longer way to code this step: gap_long <- gapminder_wide %>% gather(obstype_year, obs_values, starts_with('pop'), starts_with('lifeExp'), starts_with('gdpPercap'))

# command to download the 'wide' data
download.file(
  "https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/gh-pages/data/gapminder_wide.csv",
  destfile = "data/gapminder_wide.csv"
)

gapminder_wide <- read.csv("data/gapminder_wide.csv")

# Stack every measurement column into key/value pairs. The two identifier
# columns are excluded by name rather than by position (previously 3:38),
# so the step does not silently break if the column count or order changes.
gap_long <- gapminder_wide %>%
  gather(obstype_year, obs_values, -continent, -country)

head(gap_long)
##   continent      country   obstype_year obs_values
## 1    Africa      Algeria gdpPercap_1952  2449.0082
## 2    Africa       Angola gdpPercap_1952  3520.6103
## 3    Africa        Benin gdpPercap_1952  1062.7522
## 4    Africa     Botswana gdpPercap_1952   851.2411
## 5    Africa Burkina Faso gdpPercap_1952   543.2552
## 6    Africa      Burundi gdpPercap_1952   339.2965

Separate the obstype_year column

# Split "gdpPercap_1952"-style keys at the underscore into a measurement
# type and a year, then give each measurement type its own column.
# obs_year comes out of separate() as character, not numeric.
gap_normal <- gap_long %>%
  separate(obstype_year, into = c("obs_type", "obs_year"), sep = "_") %>%
  spread(obs_type, obs_values)

head(gap_normal)
##   continent country obs_year gdpPercap lifeExp      pop
## 1    Africa Algeria     1952  2449.008  43.077  9279525
## 2    Africa Algeria     1957  3013.976  45.685 10270856
## 3    Africa Algeria     1962  2550.817  48.303 11000948
## 4    Africa Algeria     1967  3246.992  51.407 12760499
## 5    Africa Algeria     1972  4182.664  54.518 14760787
## 6    Africa Algeria     1977  4910.417  58.014 17152804
# Compare the reshaped data with the original; differences are listed below.
all.equal(gapminder, gap_normal)
##  [1] "Names: 5 string mismatches"                                                                            
##  [2] "Component 1: Attributes: < Component \"levels\": Lengths (142, 5) differ (string compare on first 5) >"
##  [3] "Component 1: Attributes: < Component \"levels\": 5 string mismatches >"                                
##  [4] "Component 1: 1704 string mismatches"                                                                   
##  [5] "Component 2: Attributes: < target is NULL, current is list >"                                          
##  [6] "Component 2: target is numeric, current is factor"                                                     
##  [7] "Component 3: Modes: numeric, character"                                                                
##  [8] "Component 3: target is numeric, current is character"                                                  
##  [9] "Component 4: 'current' is not a factor"                                                                
## [10] "Component \"lifeExp\": Mean relative difference: 0.203822"                                             
## [11] "Component 6: Mean relative difference: 4101.546"

R likes to have ‘long’ format data, where every row is an observation and you have a single column for ‘observations’; the other columns serve to identify that observation (exceptions apply when you have multiple types of observations). To switch back and forth between ‘wide’ (how we typically enter data in a spreadsheet) and ‘long’, use tidyr.